To run this code in my project using the renv environment, run the following lines of code
install.packages("renv") #install the package on the new computer (may not be necessary if renv bootstraps itself as expected)
renv::restore() #reinstall all the package versions in the renv lockfile
renv::install("bioc::genefilter")
renv::install("DESeq2")
renv::install("bioc::apeglm")
renv::install("ashr")
renv::install("ggplot2")
renv::install("vsn")
renv::install("hexbin")
renv::install("pheatmap")
renv::install("RColorBrewer")
renv::install("bioc::EnhancedVolcano")
renv::install("tidyverse")
library(genefilter)
library(DESeq2)
## Warning: package 'DESeq2' was built under R version 4.3.3
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## Warning: package 'matrixStats' was built under R version 4.3.3
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:genefilter':
##
## rowSds, rowVars
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following objects are masked from 'package:genefilter':
##
## rowSds, rowVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(apeglm)
library(ashr)
library(ggplot2)
library(vsn)
library(hexbin)
## Warning: package 'hexbin' was built under R version 4.3.3
library(pheatmap)
library(RColorBrewer)
library(EnhancedVolcano)
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.3.3
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks IRanges::slice()
## ✖ readr::spec() masks genefilter::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
sessionInfo() #provides list of loaded packages and version of R.
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.0
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices datasets utils methods
## [8] base
##
## other attached packages:
## [1] lubridate_1.9.3 forcats_1.0.0
## [3] stringr_1.5.1 dplyr_1.1.4
## [5] purrr_1.0.2 readr_2.1.5
## [7] tidyr_1.3.1 tibble_3.2.1
## [9] tidyverse_2.0.0 EnhancedVolcano_1.20.0
## [11] ggrepel_0.9.6 RColorBrewer_1.1-3
## [13] pheatmap_1.0.12 hexbin_1.28.4
## [15] vsn_3.70.0 ggplot2_3.5.1
## [17] ashr_2.2-63 apeglm_1.24.0
## [19] DESeq2_1.42.1 SummarizedExperiment_1.30.2
## [21] Biobase_2.60.0 MatrixGenerics_1.12.3
## [23] matrixStats_1.4.1 GenomicRanges_1.52.1
## [25] GenomeInfoDb_1.36.4 IRanges_2.34.1
## [27] S4Vectors_0.38.2 BiocGenerics_0.46.0
## [29] genefilter_1.84.0
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 bitops_1.0-8 rlang_1.1.4
## [4] magrittr_2.0.3 compiler_4.3.2 RSQLite_2.3.7
## [7] png_0.1-8 vctrs_0.6.5 pkgconfig_2.0.3
## [10] crayon_1.5.3 fastmap_1.2.0 XVector_0.40.0
## [13] utf8_1.2.4 rmarkdown_2.28 tzdb_0.4.0
## [16] preprocessCore_1.64.0 bit_4.5.0 xfun_0.47
## [19] zlibbioc_1.46.0 cachem_1.1.0 jsonlite_1.8.9
## [22] blob_1.2.4 DelayedArray_0.26.7 BiocParallel_1.34.2
## [25] irlba_2.3.5.1 parallel_4.3.2 R6_2.5.1
## [28] stringi_1.8.4 bslib_0.8.0 limma_3.58.1
## [31] SQUAREM_2021.1 jquerylib_0.1.4 numDeriv_2016.8-1.1
## [34] Rcpp_1.0.13 knitr_1.48 timechange_0.3.0
## [37] Matrix_1.6-5 splines_4.3.2 tidyselect_1.2.1
## [40] rstudioapi_0.16.0 abind_1.4-8 yaml_2.3.10
## [43] codetools_0.2-20 affy_1.80.0 lattice_0.22-6
## [46] plyr_1.8.9 withr_3.0.1 KEGGREST_1.40.1
## [49] coda_0.19-4.1 evaluate_1.0.0 survival_3.7-0
## [52] Biostrings_2.68.1 pillar_1.9.0 affyio_1.72.0
## [55] BiocManager_1.30.25 renv_1.0.10 generics_0.1.3
## [58] invgamma_1.1 RCurl_1.98-1.16 truncnorm_1.0-9
## [61] emdbook_1.3.13 hms_1.1.3 munsell_0.5.1
## [64] scales_1.3.0 xtable_1.8-4 glue_1.8.0
## [67] tools_4.3.2 annotate_1.78.0 locfit_1.5-9.10
## [70] mvtnorm_1.3-1 XML_3.99-0.17 grid_4.3.2
## [73] bbmle_1.0.25.1 bdsmatrix_1.3-7 AnnotationDbi_1.62.2
## [76] colorspace_2.1-1 GenomeInfoDbData_1.2.10 cli_3.6.3
## [79] fansi_1.0.6 mixsqp_0.3-54 S4Arrays_1.0.6
## [82] gtable_0.3.5 sass_0.4.9 digest_0.6.37
## [85] memoise_2.0.1 htmltools_0.5.8.1 lifecycle_1.0.4
## [88] httr_1.4.7 statmod_1.5.0 bit64_4.5.2
## [91] MASS_7.3-60.0.1
Read in raw count data
counts_raw <- read.csv("../output_RNA/stringtie/LCM_RNA_gene_count_matrix.csv") #load in data
rownames(counts_raw) <- counts_raw[,1] #set first column that contains gene names as rownames
counts_raw <- counts_raw[,-1] #remove the column with gene names
gene_id,LCM_15,LCM_16,LCM_20,LCM_21,LCM_26,LCM_27,LCM_4,LCM_5,LCM_8,LCM_9
Read in metadata
meta <- read.csv("../data_RNA/LCM_RNA_metadata.csv")
meta <- dplyr::arrange(meta, Sample)
# Set variables as factors
meta$Tissue <- factor(meta$Tissue, levels = c("OralEpi","Aboral")) #we want OralEpi to be the baseline
meta$Fragment <- factor(meta$Fragment)
meta$Section_Date <- factor(meta$Section_Date)
meta$LCM_Date <- factor(meta$LCM_Date)
Data sanity checks!
all(meta$Sample %in% colnames(counts_raw)) #are all of the sample names in the metadata column names in the gene count matrix? Should be TRUE
## [1] TRUE
all(meta$Sample == colnames(counts_raw)) #are they the same in the same order? Should be TRUE
## [1] TRUE
ffun<-filterfun(pOverA(0.25,5)) #set up filtering parameters
counts_filt_poa <- genefilter((counts_raw), ffun) #apply filter
sum(counts_filt_poa) #count number of genes left
## [1] 15530
filtered_counts <- counts_raw[counts_filt_poa,] #keep only rows that passed filter
There are now 15530 genes in the filtered dataset.
Data sanity checks:
all(meta$Sample %in% colnames(filtered_counts)) #are all of the sample names in the metadata column names in the gene count matrix?
## [1] TRUE
all(meta$Sample == colnames(filtered_counts)) #are they the same in the same order? Should be TRUE
## [1] TRUE
Create DESeq object and run DESeq2
dds <- DESeqDataSetFromMatrix(countData = filtered_counts,
colData = meta,
design= ~ Fragment + Tissue)
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Extract results for Aboral vs. OralEpi contrast
res <- results(dds, contrast = c("Tissue","Aboral","OralEpi"))
resLFC <- lfcShrink(dds, coef="Tissue_Aboral_vs_OralEpi", res=res, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the moving direction increases the objective function value
EnhancedVolcano(res,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue')
EnhancedVolcano(resLFC,
lab = rownames(resLFC),
x = 'log2FoldChange',
y = 'pvalue')
Extract results for adjusted p-value < 0.05
res <- resLFC
resOrdered <- res[order(res$pvalue),]
# save differentially expressed genes
DE_05 <- as.data.frame(resOrdered) %>% filter(padj < 0.05)
DE_05_Up <- DE_05 %>% filter(log2FoldChange > 0) #Higher in Aboral, Lower in OralEpi
DE_05_Down <- DE_05 %>% filter(log2FoldChange < 0) #Lower in Aboral, Higher in OralEpi
DEGs <- rownames(DE_05)
nrow(DE_05)
## [1] 4177
nrow(DE_05_Up) #Higher in Aboral, Lower in OralEpi
## [1] 649
nrow(DE_05_Down) #Lower in Aboral, Higher in OralEpi
## [1] 3528
write.csv(as.data.frame(resOrdered),
file="../output_RNA/differential_expression/DESeq_results.csv")
write.csv(DE_05,
file="../output_RNA/differential_expression/DEG_05.csv")
plotMA(res, ylim=c(-20,20))
plotMA(resLFC, ylim=c(-20,20))
#After calling plotMA, one can use the function identify to interactively detect the row number of individual genes by clicking on the plot. One can then recover the gene identifiers by saving the resulting indices:
#idx <- identify(res$baseMean, res$log2FoldChange)
#rownames(res)[idx]
# because we are interested in the comparison and not the intercept, we set 'coef=2'
resNorm <- lfcShrink(dds, coef=2, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=2, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-5,5)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")
plotCounts(dds, gene=which.max(res$log2FoldChange), intgroup="Tissue")
plotCounts(dds, gene=which.min(res$log2FoldChange), intgroup="Tissue")
Transforming count data for visualization
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
ntd <- normTransform(dds) # this gives log2(n + 1)
meanSdPlot(assay(vsd), main = "vsd")
## Warning in geom_hex(bins = bins, ...): Ignoring unknown parameters: `main`
meanSdPlot(assay(rld))
meanSdPlot(assay(ntd))
Will move forward with vst transformation for visualizations
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])
#view all genes
pheatmap(assay(vsd), cluster_rows=TRUE, show_rownames=FALSE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)
#view highest count genes
select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:20]
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)
#view most significantly differentially expressed genes
select <- order(res$padj)[1:20]
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)
## Heatmap of the sample-to-sample distances
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Tissue, vsd$Fragment, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)
pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Fragment"), returnData=TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Tissue, shape=Fragment)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() + theme_bw()
Clearly, the majority of the variance in the data is explained by tissue type!
Download annotation files from genome website
# wget files
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt.gz
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.EggNog_results.txt.gz
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.KEGG_results.txt.gz
# move to references direcotry
mv *gz ../references
# unzip files
gunzip ../references/*gz
EggNog <- read.delim("../references/Pocillopora_acuta_HIv2.genes.EggNog_results.txt") %>% dplyr::rename("query" = X.query)
CDSearch <- read.delim("../references/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt", quote = "") %>% dplyr::rename("query" = X.Query)
KEGG <- read.delim("../references/Pocillopora_acuta_HIv2.genes.KEGG_results.txt", header = FALSE) %>% dplyr::rename("query" = V1, "KeggTerm" = V2)
DE_05$query <- rownames(DE_05)
DE_05_annot <- DE_05 %>% left_join(CDSearch) %>% select(query,everything())
## Joining with `by = join_by(query)`
DE_05_eggnog <- DE_05 %>% left_join(EggNog) %>% select(query,everything())
## Joining with `by = join_by(query)`
annot_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(CDSearch)
## Joining with `by = join_by(query)`
eggnog_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(EggNog)
## Joining with `by = join_by(query)`
write.csv(as.data.frame(DE_05_eggnog),
file="../output_RNA/differential_expression/DE_05_eggnog_annotation.csv")
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])
gene_labels <- eggnog_all %>% select(query,PFAMs) %>%
mutate_all(~ ifelse(is.na(.), "", .)) %>% #replace NAs with "" for labelling purposes
separate(PFAMs, into = c("PFAMs", "rest of name"), sep = ",(?=.*?,)", extra = "merge")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 13646 rows [1, 2, 3, 4,
## 5, 7, 8, 10, 11, 12, 13, 14, 15, 18, 19, 21, 22, 23, 24, 25, ...].
#view most significantly differentially expressed genes
select <- order(res$padj)[1:50]
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
# Extract the relevant assay data for the selected genes
selected_data <- assay(vsd)[select, ]
# Normalize via z-score (scale across rows)
zscore_data <- t(scale(t(selected_data)))
pheatmap(zscore_data,
cluster_rows = FALSE,
show_rownames = TRUE,
cluster_cols = TRUE,
cutree_cols = 2,
annotation_col = df,
labels_row = gene_labels[select, "PFAMs"],
fontsize_row = 5)
After you’ve confirmed your code works as expected, use renv::snapshot() to record the packages and their sources in the lockfile.”